#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(expss)
library(polycor)
library(foreach) ; library(doParallel)
library(knitr)
library(biomaRt)
suppressWarnings(suppressMessages(library(WGCNA)))

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

Load preprocessed dataset (preprocessing code in 19_10_14_data_preprocessing.Rmd) and clustering (pipeline in 19_10_21_WGCNA.Rmd)

# Gandal dataset
load('./../Data/Gandal/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame


# GO Neuronal annotations
GO_annotations = read.csv('./../../../PhD-InitialExperiments/FirstYearReview/Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)


# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_08-29-2019_with_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# Clusterings
clusterings = read_csv('./../Data/Gandal/clusters.csv')


# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>% 
  mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
  left_join(GO_neuronal, by='ID') %>% left_join(clusterings, by='ID') %>%
  mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
  mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`), 
         significant=padj<0.05 & !is.na(padj))

# Add gene symbol
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=genes_info$ID, mart=mart)

genes_info = genes_info %>% left_join(gene_names, by=c('ID'='ensembl_gene_id'))


clustering_selected = 'DynamicHybrid'
genes_info$Module = genes_info[,clustering_selected]

dataset = read.csv(paste0('./../Data/Gandal/dataset_', clustering_selected, '.csv'))
dataset$Module = dataset[,clustering_selected]


rm(DE_info, GO_annotations, clusterings)

Relation to external clinical traits

Quantifying module-trait associations

Using the hetcor function, that calculates Pearson, polyserial or polychoric correlations depending on the type of variables involved.

datTraits = datMeta %>% dplyr::select(Diagnosis_, Region, Sex, Age, PMI, RNAExtractionBatch) %>%
            rename('Diagnosis' = Diagnosis_, 'ExtractionBatch' = RNAExtractionBatch)

# Recalculate MEs with color labels
ME_object = datExpr %>% t %>% moduleEigengenes(colors = genes_info$Module)
MEs = orderMEs(ME_object$eigengenes)

# Calculate correlation between eigengenes and the traits and their p-values
moduleTraitCor = MEs %>% apply(2, function(x) hetcor(x, datTraits)$correlations[1,-1]) %>% t
rownames(moduleTraitCor) = colnames(MEs)
colnames(moduleTraitCor) = colnames(datTraits)
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(datExpr))

# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)

# In case there are any NAs
if(sum(!complete.cases(moduleTraitCor))>0){
  print(paste0(sum(is.na(moduleTraitCor)),' correlation(s) could not be calculated')) 
}

rm(ME_object)

I’m going to select all the modules that have an absolute correlation higher than 0.9 with Diagnosis to study them

moduleTraitCor = moduleTraitCor[complete.cases(moduleTraitCor),]
moduleTraitPvalue = moduleTraitPvalue[complete.cases(moduleTraitCor),]

# Sort moduleTraitCor by Diagnosis
moduleTraitCor = moduleTraitCor[order(moduleTraitCor[,1], decreasing=TRUE),]
moduleTraitPvalue = moduleTraitPvalue[order(moduleTraitCor[,1], decreasing=TRUE),]

# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)


labeledHeatmap(Matrix = moduleTraitCor, xLabels = names(datTraits), yLabels =  gsub('ME','',rownames(moduleTraitCor)), 
               yColorWidth=0, colors = brewer.pal(11,'PiYG'), bg.lab.y = gsub('ME','',rownames(moduleTraitCor)),
               textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.8, cex.lab.y = 0.75, zlim = c(-1,1),
               main = paste('Module-Trait relationships'))

diagnosis_cor = data.frame('Module' = gsub('ME','',rownames(moduleTraitCor)),
                           'MTcor' = moduleTraitCor[,'Diagnosis'],
                           'MTpval' = moduleTraitPvalue[,'Diagnosis'])

genes_info = genes_info %>% left_join(diagnosis_cor, by='Module')

rm(moduleTraitPvalue, datTraits, textMatrix, diagnosis_cor)

Studying the modules with the highest absolute correlation to Diagnosis

I was expecting the modules to consist mainly of genes with the highest LFC (largest absolute values in PC2), and they are large, but many of the largest genes are not in these modules.

top_modules = gsub('ME','',rownames(moduleTraitCor)[abs(moduleTraitCor[,'Diagnosis'])>0.9])

cat(paste0('Top modules selected: ', paste(top_modules, collapse=', '),'\n'))
## Top modules selected: #9B8EFF, #F066EA, #8DAB00
pca = datExpr %>% prcomp

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(dataset, by='ID') %>% dplyr::select(ID, PC1, PC2, Module, gene.score) %>%
            mutate(ImportantModules = ifelse(Module %in% top_modules, as.character(Module), 'Others')) %>%
            mutate(color = ifelse(ImportantModules=='Others','gray',ImportantModules),
                   alpha = ifelse(ImportantModules=='Others', 0.2, 0.4))

table(plot_data$ImportantModules)
## 
## #8DAB00 #9B8EFF #F066EA  Others 
##    1277     215    1158   13950
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=ImportantModules)) + 
         geom_point(alpha=plot_data$alpha, color=plot_data$color, aes(ID=ID)) + theme_minimal() + 
           ggtitle('Modules with strongest relation to Diagnosis'))
rm(pca)

Module Membership vs Gene Significance

create_plot = function(module){
  
  plot_data = dataset %>% dplyr::select(ID, paste0('MM.',gsub('#','',module)), GS, gene.score) %>% filter(dataset$Module==module)
  colnames(plot_data)[2] = 'Module'
  
  SFARI_colors = as.numeric(names(table(as.character(plot_data$gene.score)[plot_data$gene.score!='None'])))
  
  p = ggplotly(plot_data %>% ggplot(aes(Module, GS, color=gene.score)) + geom_point(alpha=0.5, aes(ID=ID)) + ylab('Gene Significance') +
               scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,8))) + theme_minimal() + xlab('Module Membership') +
               ggtitle(paste0('Module ', module,'  (MTcor = ', round(moduleTraitCor[paste0('ME',module),1],2),')')))
  
  return(p)
}

create_plot(top_modules[1])
create_plot(top_modules[2])
create_plot(top_modules[3])
rm(create_plot)

Modules with the strongest module-diagnosis correlation should have the highest percentage of SFARI Genes, but this doesn’t seem to be the case (even the opposite may be true)

plot_data = dataset %>% mutate('hasSFARIscore' = gene.score!='None') %>% 
            group_by(Module, MTcor, hasSFARIscore) %>% summarise(p=n()) %>% 
            left_join(dataset %>% group_by(Module) %>% summarise(n=n()), by='Module') %>% 
            mutate(p=round(p/n*100,2)) 

for(i in 1:nrow(plot_data)){
  this_row = plot_data[i,]
  if(this_row$hasSFARIscore==FALSE & this_row$p==100){
    new_row = this_row
    new_row$hasSFARIscore = TRUE
    new_row$p = 0
    plot_data = plot_data %>% rbind(new_row)
  }
}

plot_data = plot_data %>% filter(hasSFARIscore==TRUE)

ggplotly(plot_data %>% ggplot(aes(MTcor, p, size=n)) + geom_smooth(color='gray', se=FALSE) +
         geom_point(color=plot_data$Module, alpha=0.5, aes(id=Module)) + geom_hline(yintercept=mean(plot_data$p), color='gray') +
         xlab('Module-Diagnosis correlation') + ylab('% of SFARI genes') +
         theme_minimal() + theme(legend.position = 'none'))
rm(i, this_row, new_row, plot_data)

Module Eigengenes

Since these modules have the strongest relation to autism, this pattern should be reflected in their model eigengenes, having two different behaviours for the samples corresponding to autism and the ones corresponding to control.

In all three cases, the Eigengenes separate the behaviour between autism and control samples very clearly!

plot_EGs = function(module){

  plot_data = data.frame('ID' = rownames(MEs), 'MEs' = MEs[,paste0('ME',module)], 'Diagnosis' = datMeta$Diagnosis_)

  p = plot_data %>% ggplot(aes(Diagnosis, MEs, fill=Diagnosis)) + geom_boxplot() + theme_minimal() + theme(legend.position='none') +
                    ggtitle(paste0('Module ', module, '  (MTcor=',round(moduleTraitCor[paste0('ME',module),1],2),')'))
  return(p)
}

p1 = plot_EGs(top_modules[1])
p2 = plot_EGs(top_modules[2])
p3 = plot_EGs(top_modules[3])

grid.arrange(p1, p2, p3, nrow=1)

rm(plot_EGs, p1, p2, p3)

Identifying important genes

Selecting the modules with the highest correlation to Diagnosis, and, from them, the genes with the highest module membership-(absolute) gene significance

*Ordered by \(\frac{MM+|GS|}{2}\)

Only two SFARI Genes in the three lists, none from scores 1 and 2…

create_table = function(module){
  top_genes = dataset %>% left_join(genes_info %>% dplyr::select(ID, external_gene_id), by='ID') %>% 
              dplyr::select(external_gene_id, paste0('MM.',gsub('#','',module)), GS, gene.score) %>%
              filter(dataset$Module==module) %>% rename('MM' = paste0('MM.',gsub('#','',module))) %>% 
              mutate(importance = (MM+abs(GS))/2) %>% arrange(by=-importance) %>% top_n(10)
  return(top_genes)
}

top_genes_1 = create_table(top_modules[1])
kable(top_genes_1, caption=paste0('Top 10 genes for module ', top_modules[1], '  (MTcor = ',
                                  round(moduleTraitCor[paste0('ME',top_modules[1]),1],2),')'))
Top 10 genes for module #9B8EFF (MTcor = 0.98)
external_gene_id MM GS gene.score importance
MCL1 0.8768314 0.9060349 None 0.8914332
ITGA5 0.8613780 0.8646717 None 0.8630249
TOB2 0.7916638 0.8719542 None 0.8318090
SRGAP1 0.7680729 0.8951553 None 0.8316141
FOXO1 0.7883168 0.8458336 None 0.8170752
ITPRIP 0.8218756 0.8042687 None 0.8130721
LATS2 0.7719291 0.8433350 None 0.8076321
ZNF516 0.7243000 0.8764275 None 0.8003638
PARD3 0.6952386 0.8949607 None 0.7950996
PLEKHG1 0.7711860 0.8073816 None 0.7892838
top_genes_2 = create_table(top_modules[2])
kable(top_genes_2, caption=paste0('Top 10 genes for module ', top_modules[2], '  (MTcor = ',
                                  round(moduleTraitCor[paste0('ME',top_modules[2]),1],2),')'))
Top 10 genes for module #F066EA (MTcor = 0.91)
external_gene_id MM GS gene.score importance
HIST1H2AC 0.8413858 0.8648084 None 0.8530971
ZBTB20 0.8640272 0.8406765 3 0.8523519
CFLAR 0.8807946 0.8010251 None 0.8409098
HIST2H2BE 0.7903735 0.8878029 None 0.8390882
ELF1 0.7921926 0.7836121 None 0.7879024
KANK1 0.8162030 0.7554489 4 0.7858260
REST 0.8091856 0.7472778 None 0.7782317
SFMBT2 0.8039142 0.7457312 None 0.7748227
ZNF334 0.7958118 0.7482581 None 0.7720349
HEBP2 0.7890904 0.7462823 None 0.7676863
top_genes_3 = create_table(top_modules[3])
kable(top_genes_3, caption=paste0('Top 10 genes for module ', top_modules[3],'  (MTcor = ',
                                  round(moduleTraitCor[paste0('ME',top_modules[3]),1],2),')'))
Top 10 genes for module #8DAB00 (MTcor = -0.92)
external_gene_id MM GS gene.score importance
MAPK9 0.9014380 -0.9399810 None 0.9207095
TRIM37 0.9067664 -0.9250494 None 0.9159079
PREPL 0.8923376 -0.8966280 None 0.8944828
NAP1L5 0.8590945 -0.8915541 None 0.8753243
EIF5A2 0.8377986 -0.9116043 None 0.8747015
DIRAS1 0.8551447 -0.8936753 None 0.8744100
ATP6V1C1 0.8548700 -0.8885175 None 0.8716938
TTBK2 0.8740293 -0.8654230 None 0.8697261
PRKCE 0.8345108 -0.9028094 None 0.8686601
ATP6V1A 0.8277575 -0.9068579 None 0.8673077
rm(create_table)
pca = datExpr %>% prcomp

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(dataset, by='ID') %>% dplyr::select(ID, PC1, PC2, Module, gene.score) %>%
            mutate(color = ifelse(Module %in% top_modules, as.character(Module), 'gray')) %>%
            mutate(alpha = ifelse(color %in% top_modules & 
                                  ID %in% c(as.character(top_genes_1$ID), 
                                            as.character(top_genes_2$ID),
                                            as.character(top_genes_3$ID)), 1, 0.1))

plot_data %>% ggplot(aes(PC1, PC2)) + geom_point(alpha=plot_data$alpha, color=plot_data$color) + 
              theme_minimal() + ggtitle('Important genes identified through WGCNA')



Session info

sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Scientific Linux 7.6 (Nitrogen)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] WGCNA_1.68            fastcluster_1.1.25    dynamicTreeCut_1.63-1
##  [4] biomaRt_2.42.0        knitr_1.24            doParallel_1.0.15    
##  [7] iterators_1.0.12      foreach_1.4.7         polycor_0.7-10       
## [10] expss_0.10.1          GGally_1.4.0          gridExtra_2.3        
## [13] viridis_0.5.1         viridisLite_0.3.0     RColorBrewer_1.1-2   
## [16] dendextend_1.13.2     plotly_4.9.1          glue_1.3.1           
## [19] reshape2_1.4.3        forcats_0.4.0         stringr_1.4.0        
## [22] dplyr_0.8.3           purrr_0.3.3           readr_1.3.1          
## [25] tidyr_1.0.0           tibble_2.1.3          ggplot2_3.2.1        
## [28] tidyverse_1.3.0      
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1                backports_1.1.5            
##   [3] Hmisc_4.2-0                 BiocFileCache_1.10.2       
##   [5] plyr_1.8.5                  lazyeval_0.2.2             
##   [7] splines_3.6.0               crosstalk_1.0.0            
##   [9] BiocParallel_1.20.1         GenomeInfoDb_1.22.0        
##  [11] robust_0.4-18.2             digest_0.6.23              
##  [13] htmltools_0.4.0             GO.db_3.10.0               
##  [15] fansi_0.4.1                 magrittr_1.5               
##  [17] checkmate_1.9.4             memoise_1.1.0              
##  [19] fit.models_0.5-14           cluster_2.0.8              
##  [21] annotate_1.64.0             modelr_0.1.5               
##  [23] matrixStats_0.55.0          askpass_1.1                
##  [25] prettyunits_1.0.2           colorspace_1.4-1           
##  [27] blob_1.2.0                  rvest_0.3.5                
##  [29] rappdirs_0.3.1              rrcov_1.4-7                
##  [31] haven_2.2.0                 xfun_0.8                   
##  [33] crayon_1.3.4                RCurl_1.95-4.12            
##  [35] jsonlite_1.6                genefilter_1.68.0          
##  [37] zeallot_0.1.0               impute_1.60.0              
##  [39] survival_2.44-1.1           gtable_0.3.0               
##  [41] zlibbioc_1.32.0             XVector_0.26.0             
##  [43] DelayedArray_0.12.2         BiocGenerics_0.32.0        
##  [45] DEoptimR_1.0-8              scales_1.1.0               
##  [47] mvtnorm_1.0-11              DBI_1.1.0                  
##  [49] Rcpp_1.0.3                  xtable_1.8-4               
##  [51] progress_1.2.2              htmlTable_1.13.1           
##  [53] foreign_0.8-71              bit_1.1-15.1               
##  [55] preprocessCore_1.48.0       Formula_1.2-3              
##  [57] stats4_3.6.0                htmlwidgets_1.5.1          
##  [59] httr_1.4.1                  ellipsis_0.3.0             
##  [61] acepack_1.4.1               farver_2.0.3               
##  [63] pkgconfig_2.0.3             reshape_0.8.8              
##  [65] XML_3.98-1.20               nnet_7.3-12                
##  [67] dbplyr_1.4.2                locfit_1.5-9.1             
##  [69] later_1.0.0                 labeling_0.3               
##  [71] tidyselect_0.2.5            rlang_0.4.2                
##  [73] AnnotationDbi_1.48.0        munsell_0.5.0              
##  [75] cellranger_1.1.0            tools_3.6.0                
##  [77] cli_2.0.1                   generics_0.0.2             
##  [79] RSQLite_2.2.0               broom_0.5.3                
##  [81] fastmap_1.0.1               evaluate_0.14              
##  [83] yaml_2.2.0                  bit64_0.9-7                
##  [85] fs_1.3.1                    robustbase_0.93-5          
##  [87] nlme_3.1-139                mime_0.8                   
##  [89] xml2_1.2.2                  compiler_3.6.0             
##  [91] rstudioapi_0.10             curl_4.3                   
##  [93] reprex_0.3.0                geneplotter_1.64.0         
##  [95] pcaPP_1.9-73                stringi_1.4.5              
##  [97] highr_0.8                   lattice_0.20-38            
##  [99] Matrix_1.2-17               vctrs_0.2.1                
## [101] pillar_1.4.3                lifecycle_0.1.0            
## [103] data.table_1.12.8           bitops_1.0-6               
## [105] httpuv_1.5.2                GenomicRanges_1.38.0       
## [107] R6_2.4.1                    latticeExtra_0.6-28        
## [109] promises_1.1.0              IRanges_2.20.2             
## [111] codetools_0.2-16            MASS_7.3-51.4              
## [113] assertthat_0.2.1            SummarizedExperiment_1.16.1
## [115] openssl_1.4.1               DESeq2_1.26.0              
## [117] withr_2.1.2                 S4Vectors_0.24.2           
## [119] GenomeInfoDbData_1.2.2      hms_0.5.3                  
## [121] grid_3.6.0                  rpart_4.1-15               
## [123] rmarkdown_1.14              Cairo_1.5-10               
## [125] shiny_1.4.0                 Biobase_2.46.0             
## [127] lubridate_1.7.4             base64enc_0.1-3